You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by sw...@apache.org on 2016/03/15 21:01:58 UTC
ambari git commit: AMBARI-15416. Grafana password is a required field
after upgrade from 2.2.0 to 2.2.2. Fixed typo. (swagle)
Repository: ambari
Updated Branches:
refs/heads/trunk 40a906ec2 -> afb69f02f
AMBARI-15416. Grafana password is a required field after upgrade from 2.2.0 to 2.2.2. Fixed typo. (swagle)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/afb69f02
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/afb69f02
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/afb69f02
Branch: refs/heads/trunk
Commit: afb69f02f416afef9d0c69444e974dd33a9fd08b
Parents: 40a906e
Author: Siddharth Wagle <sw...@hortonworks.com>
Authored: Tue Mar 15 12:55:28 2016 -0700
Committer: Siddharth Wagle <sw...@hortonworks.com>
Committed: Tue Mar 15 12:55:28 2016 -0700
----------------------------------------------------------------------
.../metrics/timeline/PhoenixHBaseAccessor.java | 7 +-
.../common-services/HDFS/2.1.0.2.0/alerts.json | 403 +++++++++++++++++--
.../package/alerts/alert_metrics_deviation.py | 49 ++-
3 files changed, 410 insertions(+), 49 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/afb69f02/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
index d6ee9d2..df2d619 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
@@ -341,10 +341,6 @@ public class PhoenixHBaseAccessor {
boolean enableNormalizer = hbaseConf.getBoolean("hbase.normalizer.enabled", true);
boolean enableFifoCompaction = metricsConf.getBoolean("timeline.metrics.hbase.fifo.compaction.enabled", true);
- if (!enableNormalizer && !enableFifoCompaction) {
- return;
- }
-
HBaseAdmin hBaseAdmin = null;
try {
hBaseAdmin = dataSource.getHBaseAdmin();
@@ -358,8 +354,7 @@ public class PhoenixHBaseAccessor {
boolean modifyTable = false;
HTableDescriptor tableDescriptor = hBaseAdmin.getTableDescriptor(tableName.getBytes());
- if (enableNormalizer &&
- !tableDescriptor.isNormalizationEnabled()) {
+ if (enableNormalizer && !tableDescriptor.isNormalizationEnabled()) {
tableDescriptor.setNormalizationEnabled(true);
LOG.info("Enabling normalizer for " + tableName);
modifyTable = true;
http://git-wip-us.apache.org/repos/asf/ambari/blob/afb69f02/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
index 2a6229c..2f13cd6 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
@@ -298,11 +298,11 @@
},
"warning": {
"text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]",
- "value": 80
+ "value": 75
},
"critical": {
"text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]",
- "value": 90
+ "value": 80
},
"units" : "%"
},
@@ -522,12 +522,12 @@
}
},
{
- "name": "increase_nn_heap_usage_hourly",
- "label": "NameNode Heap Usage (Hourly)",
- "description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
+ "name": "namenode_service_rpc_queue_latency_hourly",
+ "label": "NameNode Service RPC Queue Latency (Hourly)",
+ "description": "This service-level alert is triggered if the deviation of RPC queue latency on datanode port has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -556,7 +556,7 @@
{
"name": "metricName",
"display_name": "Metric Name",
- "value": "jvm.JvmMetrics.MemHeapUsedM",
+ "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime",
"type": "STRING",
"description": "The metric to monitor."
},
@@ -575,17 +575,24 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
},
{
- "name": "namenode_service_rpc_latency_hourly",
- "label": "NameNode RPC Latency (Hourly)",
- "description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
+ "name": "namenode_client_rpc_queue_latency_hourly",
+ "label": "NameNode Client RPC Queue Latency (Hourly)",
+ "description": "This service-level alert is triggered if the deviation of RPC queue latency on client port has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -614,7 +621,7 @@
{
"name": "metricName",
"display_name": "Metric Name",
- "value": "rpc.rpc.RpcProcessingTimeAvgTime",
+ "value": "rpc.rpc.client.RpcQueueTimeAvgTime",
"type": "STRING",
"description": "The metric to monitor."
},
@@ -633,17 +640,24 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
},
{
- "name": "namenode_increase_in_storage_capacity_usage_hourly",
- "label": "HDFS Storage Capacity Usage (Hourly)",
- "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
+ "name": "namenode_service_rpc_processing_latency_hourly",
+ "label": "NameNode Service RPC Processing Latency (Hourly)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -651,7 +665,7 @@
{
"name": "mergeHaMetrics",
"display_name": "Whether active and stanby NameNodes metrics should be merged",
- "value": "true",
+ "value": "false",
"type": "STRING",
"description": "Whether active and stanby NameNodes metrics should be merged."
},
@@ -672,7 +686,7 @@
{
"name": "metricName",
"display_name": "Metric Name",
- "value": "dfs.FSNamesystem.CapacityUsed",
+ "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime",
"type": "STRING",
"description": "The metric to monitor."
},
@@ -691,6 +705,78 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_client_rpc_processing_latency_hourly",
+ "label": "NameNode Client RPC Processing Latency (Hourly)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
+ "interval": 5,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and stanby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and stanby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 60,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.client.RpcProcessingTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
@@ -701,7 +787,7 @@
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -739,6 +825,64 @@
"display_name": "The standard deviation threshold above which a warning is produced.",
"type": "PERCENT",
"units": "%",
+ "value": 20,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 50,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_service_rpc_processing_latency_daily",
+ "label": "NameNode Service RPC Processing Latency (Daily)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and stanby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and stanby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
"value": 100,
"threshold": "WARNING"
},
@@ -749,17 +893,24 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
},
{
- "name": "namenode_service_rpc_latency_daily",
- "label": "NameNode RPC Latency (Daily)",
- "description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
+ "name": "namenode_client_rpc_processing_latency_daily",
+ "label": "NameNode Client RPC Processing Latency (Daily)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -788,7 +939,7 @@
{
"name": "metricName",
"display_name": "Metric Name",
- "value": "rpc.rpc.RpcProcessingTimeAvgTime",
+ "value": "rpc.rpc.client.RpcProcessingTimeAvgTime",
"type": "STRING",
"description": "The metric to monitor."
},
@@ -807,6 +958,143 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_service_rpc_queue_latency_daily",
+ "label": "NameNode Service RPC Queue Latency (Daily)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and stanby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and stanby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_client_rpc_queue_latency_daily",
+ "label": "NameNode Client RPC Queue Latency (Daily)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and stanby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and stanby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.client.RpcQueueTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
@@ -817,7 +1105,7 @@
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -825,7 +1113,7 @@
{
"name": "mergeHaMetrics",
"display_name": "Whether active and stanby NameNodes metrics should be merged",
- "value": "true",
+ "value": "false",
"type": "STRING",
"description": "Whether active and stanby NameNodes metrics should be merged."
},
@@ -855,7 +1143,7 @@
"display_name": "The standard deviation threshold above which a warning is produced.",
"type": "PERCENT",
"units": "%",
- "value": 100,
+ "value": 30,
"threshold": "WARNING"
},
{
@@ -863,7 +1151,7 @@
"display_name": "The standard deviation threshold above which a critical alert is produced.",
"type": "PERCENT",
"units": "%",
- "value": 200,
+ "value": 50,
"threshold": "CRITICAL"
}
]
@@ -875,7 +1163,7 @@
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 1440,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -913,7 +1201,7 @@
"display_name": "The standard deviation threshold above which a warning is produced.",
"type": "PERCENT",
"units": "%",
- "value": 100,
+ "value": 20,
"threshold": "WARNING"
},
{
@@ -921,7 +1209,7 @@
"display_name": "The standard deviation threshold above which a critical alert is produced.",
"type": "PERCENT",
"units": "%",
- "value": 200,
+ "value": 50,
"threshold": "CRITICAL"
}
]
@@ -933,7 +1221,7 @@
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 1440,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -941,7 +1229,7 @@
{
"name": "mergeHaMetrics",
"display_name": "Whether active and stanby NameNodes metrics should be merged",
- "value": "true",
+ "value": "false",
"type": "STRING",
"description": "Whether active and stanby NameNodes metrics should be merged."
},
@@ -971,7 +1259,7 @@
"display_name": "The standard deviation threshold above which a warning is produced.",
"type": "PERCENT",
"units": "%",
- "value": 100,
+ "value": 10,
"threshold": "WARNING"
},
{
@@ -979,7 +1267,7 @@
"display_name": "The standard deviation threshold above which a critical alert is produced.",
"type": "PERCENT",
"units": "%",
- "value": 200,
+ "value": 20,
"threshold": "CRITICAL"
}
]
@@ -1161,11 +1449,11 @@
},
"warning": {
"text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]",
- "value": 80
+ "value": 75
},
"critical": {
"text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]",
- "value": 90
+ "value": 80
},
"units" : "%"
},
@@ -1189,6 +1477,47 @@
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_datanode_unmounted_data_dir.py"
}
+ },
+ {
+ "name": "datanode_heap_usage",
+ "label": "DataNode Heap Usage",
+ "description": "This host-level alert is triggered if heap usage goes past thresholds on the DataNode. It checks the DataNode JMXServlet for the MemHeapUsedM and MemHeapMaxM properties. The threshold values are in percent.",
+ "interval": 2,
+ "scope": "HOST",
+ "enabled": true,
+ "source": {
+ "type": "METRIC",
+ "uri": {
+ "http": "{{hdfs-site/dfs.datanode.http.address}}",
+ "https": "{{hdfs-site/dfs.datanode.https.address}}",
+ "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+ "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
+ "https_property": "{{hdfs-site/dfs.http.policy}}",
+ "https_property_value": "HTTPS_ONLY",
+ "connection_timeout": 5.0
+ },
+ "reporting": {
+ "ok": {
+ "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB"
+ },
+ "warning": {
+ "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB",
+ "value": 80
+ },
+ "critical": {
+ "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB",
+ "value": 90
+ },
+ "units" : "%"
+ },
+ "jmx": {
+ "property_list": [
+ "Hadoop:service=DataNode,name=JvmMetrics/MemHeapUsedM",
+ "Hadoop:service=DataNode,name=JvmMetrics/MemHeapMaxM"
+ ],
+ "value": "100.0 - (({1} - {0})/{1} * 100.0)"
+ }
+ }
}
],
"ZKFC": [
http://git-wip-us.apache.org/repos/asf/ambari/blob/afb69f02/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
index f6a9a56..f62c4a3 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
@@ -70,6 +70,9 @@ DEVIATION_CRITICAL_THRESHOLD_KEY = 'metric.deviation.critical.threshold'
DEVIATION_CRITICAL_THRESHOLD_DEFAULT = 10
DEVIATION_WARNING_THRESHOLD_KEY = 'metric.deviation.warning.threshold'
DEVIATION_WARNING_THRESHOLD_DEFAULT = 5
+NAMENODE_SERVICE_RPC_PORT_KEY = ''
+
+MINIMUM_VALUE_THRESHOLD_KEY = 'minimumValue'
AMS_METRICS_GET_URL = "/ws/v1/timeline/metrics?%s"
@@ -130,6 +133,10 @@ def execute(configurations={}, parameters={}, host_name=None):
if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
critical_threshold = int(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])
+ minimum_value_threshold = None
+ if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
+ minimum_value_threshold = int(parameters[MINIMUM_VALUE_THRESHOLD_KEY])
+
#parse configuration
if configurations is None:
return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
@@ -149,6 +156,16 @@ def execute(configurations={}, parameters={}, host_name=None):
else:
return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])
+ namenode_service_rpc_address = None
+ # hdfs-site is required
+ if not HDFS_SITE_KEY in configurations:
+ return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])
+
+ hdfs_site = configurations[HDFS_SITE_KEY]
+
+ if 'dfs.namenode.servicerpc-address' in hdfs_site:
+ namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']
+
# if namenode alert and HA mode
if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
# hdfs-site is required
@@ -186,7 +203,6 @@ def execute(configurations={}, parameters={}, host_name=None):
kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)
name_service = configurations[NAMESERVICE_KEY]
- hdfs_site = configurations[HDFS_SITE_KEY]
# look for dfs.ha.namenodes.foo
nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
@@ -207,7 +223,7 @@ def execute(configurations={}, parameters={}, host_name=None):
active_namenodes = []
nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
for nn_unique_id in nn_unique_ids:
- key = namenode_http_fragment.format(name_service,nn_unique_id)
+ key = namenode_http_fragment.format(name_service, nn_unique_id)
if key in hdfs_site:
# use str() to ensure that unicode strings do not have the u' in them
@@ -234,21 +250,32 @@ def execute(configurations={}, parameters={}, host_name=None):
if state == HDFS_NN_STATE_ACTIVE:
active_namenodes.append(namenode)
+
+ # Only check active NN
+ nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(name_service, nn_unique_id)
+ if nn_service_rpc_address_key in hdfs_site:
+ namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
+ pass
except:
logger.exception("Unable to determine active NameNode")
-
+ pass
if merge_ha_metrics:
hostnames = ",".join(namenodes)
- # run only on active NN, no need to run the same requests from the
+ # run only on active NN, no need to run the same requests from the standby
if host_name not in active_namenodes:
return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
+ pass
+
+ # Skip service rpc alert if port is not enabled
+ if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
+ return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])
get_metrics_parameters = {
"metricNames": metric_name,
"appId": app_id,
"hostname": hostnames,
- "startTime": current_time - interval*60*1000,
+ "startTime": current_time - interval * 60 * 1000,
"endTime": current_time,
"grouped": "true",
}
@@ -274,15 +301,25 @@ def execute(configurations={}, parameters={}, host_name=None):
# if host1 reports small local values, but host2 reports large local values
for metrics_data in data_json["metrics"]:
metrics += metrics_data["metrics"].values()
+ pass
if not metrics or len(metrics) < 2:
return (RESULT_STATE_UNKNOWN, ["Unable to calculate the standard deviation for {0} datapoints".format(len(metrics))])
+ # Filter out points below min threshold
+ for metric in metrics:
+ if metric <= minimum_value_threshold:
+ metrics.remove(metric)
+ pass
+
+ if len(metrics) < 2:
+ return (RESULT_STATE_SKIPPED, ['No datapoints found above the minimum threshold of {0}'.format(minimum_value_threshold)])
+
mean = calculate_mean(metrics)
stddev = calulate_sample_std_deviation(metrics)
try:
- deviation_percent = stddev/mean*100
+ deviation_percent = stddev / mean * 100
except ZeroDivisionError:
# should not be a case for this alert
return (RESULT_STATE_UNKNOWN, ["Unable to calculate the standard deviation percentage. The mean value is 0"])