You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by sw...@apache.org on 2016/03/15 03:13:01 UTC
[1/2] ambari git commit: AMBARI-15416. Grafana password is required
field after upgrade from 2.2.0 to 2.2.2. (swagle)
Repository: ambari
Updated Branches:
refs/heads/branch-2.2 a38e3a96e -> ac740437b
AMBARI-15416. Grafana password is required field after upgrade from 2.2.0 to 2.2.2. (swagle)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/ac740437
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/ac740437
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/ac740437
Branch: refs/heads/branch-2.2
Commit: ac740437b162d742430c986d7fe2350684144f58
Parents: 66b3eb2
Author: Siddharth Wagle <sw...@hortonworks.com>
Authored: Mon Mar 14 19:12:26 2016 -0700
Committer: Siddharth Wagle <sw...@hortonworks.com>
Committed: Mon Mar 14 19:12:33 2016 -0700
----------------------------------------------------------------------
.../stacks/HDP/2.0.6/services/stack_advisor.py | 14 ++++++++++++++
.../python/stacks/2.2/common/test_stack_advisor.py | 7 +++++++
2 files changed, 21 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/ac740437/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
index 9acad1d..f15263e 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
+++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/stack_advisor.py
@@ -470,6 +470,7 @@ class HDP206StackAdvisor(DefaultStackAdvisor):
putAmsHbaseSiteProperty = self.putProperty(configurations, "ams-hbase-site", services)
putAmsSiteProperty = self.putProperty(configurations, "ams-site", services)
putHbaseEnvProperty = self.putProperty(configurations, "ams-hbase-env", services)
+ putGrafanaPropertyAttribute = self.putPropertyAttribute(configurations, "ams-grafana-env")
amsCollectorHosts = self.getComponentHostNames(services, "AMBARI_METRICS", "METRICS_COLLECTOR")
@@ -633,6 +634,19 @@ class HDP206StackAdvisor(DefaultStackAdvisor):
putAmsSiteProperty("timeline.metrics.host.aggregate.splitpoints", ','.join(precision_splits))
putAmsSiteProperty("timeline.metrics.cluster.aggregate.splitpoints", ','.join(aggregate_splits))
+ component_grafana_exists = False
+ for service in services:
+ if 'components' in service:
+ for component in service['components']:
+ if 'StackServiceComponents' in component:
+ if 'METRICS_GRAFANA' in component['StackServiceComponents']['component_name']:
+ component_grafana_exists = True
+ break
+ pass
+
+ if not component_grafana_exists:
+ putGrafanaPropertyAttribute("metrics_grafana_password", "visible", "false")
+
pass
def getHostNamesWithComponent(self, serviceName, componentName, services):
http://git-wip-us.apache.org/repos/asf/ambari/blob/ac740437/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
index 6a8a6c8..f09198f 100644
--- a/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
+++ b/ambari-server/src/test/python/stacks/2.2/common/test_stack_advisor.py
@@ -1999,6 +1999,13 @@ class TestHDP22StackAdvisor(TestCase):
"hbase_regionserver_heapsize": "768"
}
},
+ "ams-grafana-env": {
+ "property_attributes": {
+ "metrics_grafana_password": {
+ "visible": "false"
+ }
+ }
+ },
"ams-env": {
"properties": {
"metrics_collector_heapsize": "512",
[2/2] ambari git commit: AMBARI-15415. Fix new HDFS AMS alerts to
account for NN rpc ports. (swagle)
Posted by sw...@apache.org.
AMBARI-15415. Fix new HDFS AMS alerts to account for NN rpc ports. (swagle)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/66b3eb2b
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/66b3eb2b
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/66b3eb2b
Branch: refs/heads/branch-2.2
Commit: 66b3eb2b6d818dcad41c9342c190ddd47cd46074
Parents: a38e3a9
Author: Siddharth Wagle <sw...@hortonworks.com>
Authored: Mon Mar 14 17:17:57 2016 -0700
Committer: Siddharth Wagle <sw...@hortonworks.com>
Committed: Mon Mar 14 19:12:33 2016 -0700
----------------------------------------------------------------------
.../metrics/timeline/PhoenixHBaseAccessor.java | 10 +-
.../common-services/HDFS/2.1.0.2.0/alerts.json | 403 +++++++++++++++++--
.../package/alerts/alert_metrics_deviation.py | 49 ++-
3 files changed, 412 insertions(+), 50 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/66b3eb2b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
----------------------------------------------------------------------
diff --git a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
index 611cb92..dbaec55 100644
--- a/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
+++ b/ambari-metrics/ambari-metrics-timelineservice/src/main/java/org/apache/hadoop/yarn/server/applicationhistoryservice/metrics/timeline/PhoenixHBaseAccessor.java
@@ -339,10 +339,6 @@ public class PhoenixHBaseAccessor {
boolean enableNormalizer = hbaseConf.getBoolean("hbase.normalizer.enabled", true);
boolean enableFifoCompaction = metricsConf.getBoolean("timeline.metrics.hbase.fifo.compaction.enabled", true);
- if (!enableNormalizer && !enableFifoCompaction) {
- return;
- }
-
HBaseAdmin hBaseAdmin = null;
try {
hBaseAdmin = dataSource.getHBaseAdmin();
@@ -354,10 +350,10 @@ public class PhoenixHBaseAccessor {
for (String tableName : PHOENIX_TABLES) {
try {
boolean modifyTable = false;
- HTableDescriptor tableDescriptor = hBaseAdmin.getTableDescriptor(tableName.getBytes());
+ HTableDescriptor tableDescriptor = hBaseAdmin.getTableDescriptor
+ (tableName.getBytes());
- if (enableNormalizer &&
- !tableDescriptor.isNormalizationEnabled()) {
+ if (enableNormalizer && !tableDescriptor.isNormalizationEnabled()) {
tableDescriptor.setNormalizationEnabled(true);
LOG.info("Enabling normalizer for " + tableName);
modifyTable = true;
http://git-wip-us.apache.org/repos/asf/ambari/blob/66b3eb2b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
index 2a6229c..2f13cd6 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
@@ -298,11 +298,11 @@
},
"warning": {
"text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]",
- "value": 80
+ "value": 75
},
"critical": {
"text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]",
- "value": 90
+ "value": 80
},
"units" : "%"
},
@@ -522,12 +522,12 @@
}
},
{
- "name": "increase_nn_heap_usage_hourly",
- "label": "NameNode Heap Usage (Hourly)",
- "description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
+ "name": "namenode_service_rpc_queue_latency_hourly",
+ "label": "NameNode Service RPC Queue Latency (Hourly)",
+ "description": "This service-level alert is triggered if the deviation of RPC queue latency on datanode port has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -556,7 +556,7 @@
{
"name": "metricName",
"display_name": "Metric Name",
- "value": "jvm.JvmMetrics.MemHeapUsedM",
+ "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime",
"type": "STRING",
"description": "The metric to monitor."
},
@@ -575,17 +575,24 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
},
{
- "name": "namenode_service_rpc_latency_hourly",
- "label": "NameNode RPC Latency (Hourly)",
- "description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
+ "name": "namenode_client_rpc_queue_latency_hourly",
+ "label": "NameNode Client RPC Queue Latency (Hourly)",
+ "description": "This service-level alert is triggered if the deviation of RPC queue latency on client port has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -614,7 +621,7 @@
{
"name": "metricName",
"display_name": "Metric Name",
- "value": "rpc.rpc.RpcProcessingTimeAvgTime",
+ "value": "rpc.rpc.client.RpcQueueTimeAvgTime",
"type": "STRING",
"description": "The metric to monitor."
},
@@ -633,17 +640,24 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
},
{
- "name": "namenode_increase_in_storage_capacity_usage_hourly",
- "label": "HDFS Storage Capacity Usage (Hourly)",
- "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
+ "name": "namenode_service_rpc_processing_latency_hourly",
+ "label": "NameNode Service RPC Processing Latency (Hourly)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -651,7 +665,7 @@
{
"name": "mergeHaMetrics",
"display_name": "Whether active and stanby NameNodes metrics should be merged",
- "value": "true",
+ "value": "false",
"type": "STRING",
"description": "Whether active and stanby NameNodes metrics should be merged."
},
@@ -672,7 +686,7 @@
{
"name": "metricName",
"display_name": "Metric Name",
- "value": "dfs.FSNamesystem.CapacityUsed",
+ "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime",
"type": "STRING",
"description": "The metric to monitor."
},
@@ -691,6 +705,78 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_client_rpc_processing_latency_hourly",
+ "label": "NameNode Client RPC Processing Latency (Hourly)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
+ "interval": 5,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and stanby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and stanby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 60,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.client.RpcProcessingTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
@@ -701,7 +787,7 @@
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -739,6 +825,64 @@
"display_name": "The standard deviation threshold above which a warning is produced.",
"type": "PERCENT",
"units": "%",
+ "value": 20,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 50,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_service_rpc_processing_latency_daily",
+ "label": "NameNode Service RPC Processing Latency (Daily)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and stanby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and stanby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
"value": 100,
"threshold": "WARNING"
},
@@ -749,17 +893,24 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
},
{
- "name": "namenode_service_rpc_latency_daily",
- "label": "NameNode RPC Latency (Daily)",
- "description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
+ "name": "namenode_client_rpc_processing_latency_daily",
+ "label": "NameNode Client RPC Processing Latency (Daily)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -788,7 +939,7 @@
{
"name": "metricName",
"display_name": "Metric Name",
- "value": "rpc.rpc.RpcProcessingTimeAvgTime",
+ "value": "rpc.rpc.client.RpcProcessingTimeAvgTime",
"type": "STRING",
"description": "The metric to monitor."
},
@@ -807,6 +958,143 @@
"units": "%",
"value": 200,
"threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_service_rpc_queue_latency_daily",
+ "label": "NameNode Service RPC Queue Latency (Daily)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and stanby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and stanby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_client_rpc_queue_latency_daily",
+ "label": "NameNode Client RPC Queue Latency (Daily)",
+ "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and stanby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and stanby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.client.RpcQueueTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ },
+ {
+ "name": "minimumValue",
+ "display_name": "Minimum Latency (in seconds)",
+ "value": 30,
+ "type": "NUMERIC",
+ "description": "Minimum latency time to measure (in seconds)."
}
]
}
@@ -817,7 +1105,7 @@
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -825,7 +1113,7 @@
{
"name": "mergeHaMetrics",
"display_name": "Whether active and stanby NameNodes metrics should be merged",
- "value": "true",
+ "value": "false",
"type": "STRING",
"description": "Whether active and stanby NameNodes metrics should be merged."
},
@@ -855,7 +1143,7 @@
"display_name": "The standard deviation threshold above which a warning is produced.",
"type": "PERCENT",
"units": "%",
- "value": 100,
+ "value": 30,
"threshold": "WARNING"
},
{
@@ -863,7 +1151,7 @@
"display_name": "The standard deviation threshold above which a critical alert is produced.",
"type": "PERCENT",
"units": "%",
- "value": 200,
+ "value": 50,
"threshold": "CRITICAL"
}
]
@@ -875,7 +1163,7 @@
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 1440,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -913,7 +1201,7 @@
"display_name": "The standard deviation threshold above which a warning is produced.",
"type": "PERCENT",
"units": "%",
- "value": 100,
+ "value": 20,
"threshold": "WARNING"
},
{
@@ -921,7 +1209,7 @@
"display_name": "The standard deviation threshold above which a critical alert is produced.",
"type": "PERCENT",
"units": "%",
- "value": 200,
+ "value": 50,
"threshold": "CRITICAL"
}
]
@@ -933,7 +1221,7 @@
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 1440,
"scope": "ANY",
- "enabled": false,
+ "enabled": true,
"source": {
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
@@ -941,7 +1229,7 @@
{
"name": "mergeHaMetrics",
"display_name": "Whether active and stanby NameNodes metrics should be merged",
- "value": "true",
+ "value": "false",
"type": "STRING",
"description": "Whether active and stanby NameNodes metrics should be merged."
},
@@ -971,7 +1259,7 @@
"display_name": "The standard deviation threshold above which a warning is produced.",
"type": "PERCENT",
"units": "%",
- "value": 100,
+ "value": 10,
"threshold": "WARNING"
},
{
@@ -979,7 +1267,7 @@
"display_name": "The standard deviation threshold above which a critical alert is produced.",
"type": "PERCENT",
"units": "%",
- "value": 200,
+ "value": 20,
"threshold": "CRITICAL"
}
]
@@ -1161,11 +1449,11 @@
},
"warning": {
"text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]",
- "value": 80
+ "value": 75
},
"critical": {
"text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]",
- "value": 90
+ "value": 80
},
"units" : "%"
},
@@ -1189,6 +1477,47 @@
"type": "SCRIPT",
"path": "HDFS/2.1.0.2.0/package/alerts/alert_datanode_unmounted_data_dir.py"
}
+ },
+ {
+ "name": "datanode_heap_usage",
+ "label": "DataNode Heap Usage",
+ "description": "This host-level alert is triggered if heap usage goes past thresholds on the DataNode. It checks the DataNode JMXServlet for the MemHeapUsedM and MemHeapMaxM properties. The threshold values are in percent.",
+ "interval": 2,
+ "scope": "HOST",
+ "enabled": true,
+ "source": {
+ "type": "METRIC",
+ "uri": {
+ "http": "{{hdfs-site/dfs.datanode.http.address}}",
+ "https": "{{hdfs-site/dfs.datanode.https.address}}",
+ "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
+ "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
+ "https_property": "{{hdfs-site/dfs.http.policy}}",
+ "https_property_value": "HTTPS_ONLY",
+ "connection_timeout": 5.0
+ },
+ "reporting": {
+ "ok": {
+ "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB"
+ },
+ "warning": {
+ "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB",
+ "value": 80
+ },
+ "critical": {
+ "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB",
+ "value": 90
+ },
+ "units" : "%"
+ },
+ "jmx": {
+ "property_list": [
+ "Hadoop:service=DataNode,name=JvmMetrics/MemHeapUsedM",
+ "Hadoop:service=DataNode,name=JvmMetrics/MemHeapMaxM"
+ ],
+ "value": "100.0 - (({1} - {0})/{1} * 100.0)"
+ }
+ }
}
],
"ZKFC": [
http://git-wip-us.apache.org/repos/asf/ambari/blob/66b3eb2b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
index f6a9a56..f62c4a3 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
@@ -70,6 +70,9 @@ DEVIATION_CRITICAL_THRESHOLD_KEY = 'metric.deviation.critical.threshold'
DEVIATION_CRITICAL_THRESHOLD_DEFAULT = 10
DEVIATION_WARNING_THRESHOLD_KEY = 'metric.deviation.warning.threshold'
DEVIATION_WARNING_THRESHOLD_DEFAULT = 5
+NAMENODE_SERVICE_RPC_PORT_KEY = ''
+
+MINIMUM_VALUE_THRESHOLD_KEY = 'minimumValue'
AMS_METRICS_GET_URL = "/ws/v1/timeline/metrics?%s"
@@ -130,6 +133,10 @@ def execute(configurations={}, parameters={}, host_name=None):
if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
critical_threshold = int(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])
+ minimum_value_threshold = None
+ if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
+ minimum_value_threshold = int(parameters[MINIMUM_VALUE_THRESHOLD_KEY])
+
#parse configuration
if configurations is None:
return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
@@ -149,6 +156,16 @@ def execute(configurations={}, parameters={}, host_name=None):
else:
return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])
+ namenode_service_rpc_address = None
+ # hdfs-site is required
+ if not HDFS_SITE_KEY in configurations:
+ return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])
+
+ hdfs_site = configurations[HDFS_SITE_KEY]
+
+ if 'dfs.namenode.servicerpc-address' in hdfs_site:
+ namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']
+
# if namenode alert and HA mode
if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
# hdfs-site is required
@@ -186,7 +203,6 @@ def execute(configurations={}, parameters={}, host_name=None):
kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)
name_service = configurations[NAMESERVICE_KEY]
- hdfs_site = configurations[HDFS_SITE_KEY]
# look for dfs.ha.namenodes.foo
nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
@@ -207,7 +223,7 @@ def execute(configurations={}, parameters={}, host_name=None):
active_namenodes = []
nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
for nn_unique_id in nn_unique_ids:
- key = namenode_http_fragment.format(name_service,nn_unique_id)
+ key = namenode_http_fragment.format(name_service, nn_unique_id)
if key in hdfs_site:
# use str() to ensure that unicode strings do not have the u' in them
@@ -234,21 +250,32 @@ def execute(configurations={}, parameters={}, host_name=None):
if state == HDFS_NN_STATE_ACTIVE:
active_namenodes.append(namenode)
+
+ # Only check active NN
+ nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(name_service, nn_unique_id)
+ if nn_service_rpc_address_key in hdfs_site:
+ namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
+ pass
except:
logger.exception("Unable to determine active NameNode")
-
+ pass
if merge_ha_metrics:
hostnames = ",".join(namenodes)
- # run only on active NN, no need to run the same requests from the
+ # run only on active NN, no need to run the same requests from the standby
if host_name not in active_namenodes:
return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
+ pass
+
+ # Skip service rpc alert if port is not enabled
+ if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
+ return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])
get_metrics_parameters = {
"metricNames": metric_name,
"appId": app_id,
"hostname": hostnames,
- "startTime": current_time - interval*60*1000,
+ "startTime": current_time - interval * 60 * 1000,
"endTime": current_time,
"grouped": "true",
}
@@ -274,15 +301,25 @@ def execute(configurations={}, parameters={}, host_name=None):
# if host1 reports small local values, but host2 reports large local values
for metrics_data in data_json["metrics"]:
metrics += metrics_data["metrics"].values()
+ pass
if not metrics or len(metrics) < 2:
return (RESULT_STATE_UNKNOWN, ["Unable to calculate the standard deviation for {0} datapoints".format(len(metrics))])
+ # Filter out points below min threshold
+ for metric in metrics:
+ if metric <= minimum_value_threshold:
+ metrics.remove(metric)
+ pass
+
+ if len(metrics) < 2:
+ return (RESULT_STATE_SKIPPED, ['No datapoints found above the minimum threshold of {0}'.format(minimum_value_threshold)])
+
mean = calculate_mean(metrics)
stddev = calulate_sample_std_deviation(metrics)
try:
- deviation_percent = stddev/mean*100
+ deviation_percent = stddev / mean * 100
except ZeroDivisionError:
# should not be a case for this alert
return (RESULT_STATE_UNKNOWN, ["Unable to calculate the standard deviation percentage. The mean value is 0"])